# Load the raw data set from the project's data directory.
train <- read_csv("data/train.csv")
## Rows: 27674 Columns: 83
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): patient_race, payer_type, patient_state, breast_cancer_diagnosis_c...
## dbl (70): patient_id, patient_zip3, patient_age, bmi, breast_cancer_diagnosi...
## lgl (1): patient_gender
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Structural overview of the full data set: column count, row count,
# column names, and a per-column summary from skim().
ncol(train)
nrow(train)
colnames(train)
skim(train)
# Reproducible 70/30 split on the outcome variable. The indices returned by
# createDataPartition() select the training rows; the rest become the test set.
set.seed(3911)
inTrain <- createDataPartition(train$treatment_pd, p = 0.7, list = FALSE)
training <- train[inTrain, ]
testing <- train[-inTrain, ]

# Column-by-column preview of the training partition.
glimpse(training)
## Rows: 19,373
## Columns: 83
## $ patient_id <dbl> 994155, 154389, 921275, 235192, …
## $ patient_race <chr> "Asian", NA, "Hispanic", NA, "Wh…
## $ payer_type <chr> "COMMERCIAL", "MEDICARE ADVANTAG…
## $ patient_state <chr> "CA", "OH", "CA", "IN", "NM", "A…
## $ patient_zip3 <dbl> 917, 451, 928, 462, 877, 356, 33…
## $ patient_age <dbl> 46, 63, 50, 35, 58, 37, 67, 45, …
## $ patient_gender <lgl> FALSE, FALSE, FALSE, FALSE, FALS…
## $ bmi <dbl> 27.00, NA, NA, NA, NA, NA, NA, N…
## $ breast_cancer_diagnosis_code <chr> "C50811", "C50412", "1749", "C50…
## $ breast_cancer_diagnosis_desc <chr> "Malignant neoplasm of ovrlp sit…
## $ breast_cancer_diagnosis_year <dbl> 2018, 2018, 2015, 2016, 2015, 20…
## $ metastatic_cancer_diagnosis_code <chr> "C779", "C7951", "C787", "C773",…
## $ metastatic_first_treatment <chr> "DOXORUBICIN HCL", "DOXORUBICIN …
## $ metastatic_first_treatment_type <chr> NA, NA, NA, "Antineoplastics", "…
## $ metastatic_first_novel_treatment <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ metastatic_first_novel_treatment_type <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ region <chr> "West", "Midwest", "West", "Midw…
## $ division <chr> "Pacific", "East North Central",…
## $ population <dbl> 43031, 7228, 39122, 25675, 1266,…
## $ density <dbl> 2048.578261, 194.656250, 2295.93…
## $ age_median <dbl> 38.85217, 41.24783, 38.20000, 35…
## $ age_under_10 <dbl> 11.306522, 12.855319, 11.878788,…
## $ age_10_to_19 <dbl> 12.897826, 12.789362, 13.354545,…
## $ age_20s <dbl> 14.121739, 11.261702, 14.230303,…
## $ age_30s <dbl> 13.532609, 10.489362, 13.418182,…
## $ age_40s <dbl> 13.160870, 11.859574, 13.333333,…
## $ age_50s <dbl> 13.37826, 15.27872, 14.06061, 11…
## $ age_60s <dbl> 11.473913, 13.359574, 10.248485,…
## $ age_70s <dbl> 6.380435, 6.434043, 5.951515, 6.…
## $ age_over_80 <dbl> 3.736957, 5.663830, 3.503030, 3.…
## $ male <dbl> 49.05217, 52.09149, 49.89394, 48…
## $ female <dbl> 50.94783, 47.90851, 50.10606, 51…
## $ married <dbl> 48.50435, 50.67234, 50.24545, 39…
## $ divorced <dbl> 10.117391, 14.102128, 9.827273, …
## $ never_married <dbl> 36.40870, 27.11702, 35.29091, 41…
## $ widowed <dbl> 4.969565, 8.112766, 4.651515, 4.…
## $ family_size <dbl> 3.674783, 3.119565, 3.622727, 3.…
## $ family_dual_income <dbl> 59.21957, 51.22826, 61.73636, 54…
## $ income_household_median <dbl> 86330.39, 65214.72, 102741.64, 6…
## $ income_household_under_5 <dbl> 2.226087, 2.329787, 2.327273, 4.…
## $ income_household_5_to_10 <dbl> 1.5282609, 3.2489362, 1.5363636,…
## $ income_household_10_to_15 <dbl> 2.897826, 4.468085, 2.648485, 3.…
## $ income_household_15_to_20 <dbl> 2.747826, 5.878723, 2.178788, 4.…
## $ income_household_20_to_25 <dbl> 3.173913, 5.323404, 2.409091, 5.…
## $ income_household_25_to_35 <dbl> 6.647826, 7.840426, 5.163636, 8.…
## $ income_household_35_to_50 <dbl> 9.617391, 12.246809, 7.972727, 1…
## $ income_household_50_to_75 <dbl> 15.96522, 20.04043, 13.93636, 17…
## $ income_household_75_to_100 <dbl> 13.589130, 14.051064, 12.469697,…
## $ income_household_100_to_150 <dbl> 19.75217, 15.67660, 19.76061, 13…
## $ income_household_150_over <dbl> 21.847826, 8.902128, 29.596970, …
## $ income_household_six_figure <dbl> 41.60000, 24.57872, 49.35758, 26…
## $ income_individual_median <dbl> 34317.83, 32142.23, 41287.27, 36…
## $ home_ownership <dbl> 61.39783, 72.39149, 61.46364, 58…
## $ housing_units <dbl> 12609.2609, 2789.9583, 11725.666…
## $ home_value <dbl> 572606.5, 155901.8, 677688.5, 17…
## $ rent_median <dbl> 1778.0000, 828.0000, 2003.1250, …
## $ rent_burden <dbl> 34.59565, 26.51429, 34.75312, 31…
## $ education_less_highschool <dbl> 17.491304, 15.829787, 14.230303,…
## $ education_highschool <dbl> 22.65652, 38.96809, 19.98788, 25…
## $ education_some_college <dbl> 29.26304, 27.68298, 29.79697, 25…
## $ education_bachelors <dbl> 20.200000, 11.625532, 23.739394,…
## $ education_graduate <dbl> 10.404348, 5.887234, 12.245455, …
## $ education_college_or_above <dbl> 30.60435, 17.51277, 35.98485, 37…
## $ education_stem_degree <dbl> 46.20870, 38.30889, 47.91818, 40…
## $ labor_force_participation <dbl> 63.15435, 61.27234, 65.23030, 67…
## $ unemployment_rate <dbl> 6.197826, 5.793478, 5.103030, 5.…
## $ self_employed <dbl> 15.708696, 11.202500, 15.224242,…
## $ farmer <dbl> 0.015217391, 3.715000000, 0.0272…
## $ race_white <dbl> 38.70870, 96.05532, 54.03030, 61…
## $ race_black <dbl> 3.9630435, 1.0063830, 2.5272727,…
## $ race_asian <dbl> 25.5652174, 0.3212766, 20.827272…
## $ race_native <dbl> 1.19347826, 0.11702128, 0.587878…
## $ race_pacific <dbl> 0.269565217, 0.002127660, 0.3000…
## $ race_other <dbl> 18.8586957, 0.2553191, 11.645454…
## $ race_multiple <dbl> 11.426087, 2.234043, 10.081818, …
## $ hispanic <dbl> 47.726087, 1.182979, 37.948485, …
## $ disabled <dbl> 9.895652, 18.317021, 8.957576, 1…
## $ poverty <dbl> 10.515217, 13.546809, 10.109091,…
## $ limited_english <dbl> 12.7456522, 0.1468085, 8.0575758…
## $ commute_time <dbl> 32.53043, 31.89091, 30.60606, 23…
## $ health_uninsured <dbl> 7.263043, 7.631915, 7.018182, 8.…
## $ veteran <dbl> 3.810870, 9.631915, 4.103030, 6.…
## $ treatment_pd <dbl> 35, 33, 455, 75, 393, 62, 43, 13…
# Dimensions and column names of the training partition.
ncol(training)
## [1] 83
nrow(training)
## [1] 19373
colnames(training)
## [1] "patient_id"
## [2] "patient_race"
## [3] "payer_type"
## [4] "patient_state"
## [5] "patient_zip3"
## [6] "patient_age"
## [7] "patient_gender"
## [8] "bmi"
## [9] "breast_cancer_diagnosis_code"
## [10] "breast_cancer_diagnosis_desc"
## [11] "breast_cancer_diagnosis_year"
## [12] "metastatic_cancer_diagnosis_code"
## [13] "metastatic_first_treatment"
## [14] "metastatic_first_treatment_type"
## [15] "metastatic_first_novel_treatment"
## [16] "metastatic_first_novel_treatment_type"
## [17] "region"
## [18] "division"
## [19] "population"
## [20] "density"
## [21] "age_median"
## [22] "age_under_10"
## [23] "age_10_to_19"
## [24] "age_20s"
## [25] "age_30s"
## [26] "age_40s"
## [27] "age_50s"
## [28] "age_60s"
## [29] "age_70s"
## [30] "age_over_80"
## [31] "male"
## [32] "female"
## [33] "married"
## [34] "divorced"
## [35] "never_married"
## [36] "widowed"
## [37] "family_size"
## [38] "family_dual_income"
## [39] "income_household_median"
## [40] "income_household_under_5"
## [41] "income_household_5_to_10"
## [42] "income_household_10_to_15"
## [43] "income_household_15_to_20"
## [44] "income_household_20_to_25"
## [45] "income_household_25_to_35"
## [46] "income_household_35_to_50"
## [47] "income_household_50_to_75"
## [48] "income_household_75_to_100"
## [49] "income_household_100_to_150"
## [50] "income_household_150_over"
## [51] "income_household_six_figure"
## [52] "income_individual_median"
## [53] "home_ownership"
## [54] "housing_units"
## [55] "home_value"
## [56] "rent_median"
## [57] "rent_burden"
## [58] "education_less_highschool"
## [59] "education_highschool"
## [60] "education_some_college"
## [61] "education_bachelors"
## [62] "education_graduate"
## [63] "education_college_or_above"
## [64] "education_stem_degree"
## [65] "labor_force_participation"
## [66] "unemployment_rate"
## [67] "self_employed"
## [68] "farmer"
## [69] "race_white"
## [70] "race_black"
## [71] "race_asian"
## [72] "race_native"
## [73] "race_pacific"
## [74] "race_other"
## [75] "race_multiple"
## [76] "hispanic"
## [77] "disabled"
## [78] "poverty"
## [79] "limited_english"
## [80] "commute_time"
## [81] "health_uninsured"
## [82] "veteran"
## [83] "treatment_pd"
# Per-column summary (counts, missingness, distribution) of the training data.
skim(training)
Variable type: numeric
| var | n | na | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| patient_id | 19373 | 0 | 549376.82 | 260633.56 | 100051.00 | 323144.00 | 549366.00 | 776646.00 | 999996.00 |
| patient_zip3 | 19373 | 0 | 523.98 | 292.01 | 6.00 | 296.00 | 490.00 | 785.00 | 996.00 |
| patient_age | 19373 | 0 | 54.43 | 11.20 | 19.00 | 47.00 | 55.00 | 62.00 | 91.00 |
| bmi | 19373 | 13243 | 29.33 | 5.65 | 14.00 | 25.00 | 28.88 | 33.00 | 97.00 |
| breast_cancer_diagnosis_year | 19373 | 0 | 2016.49 | 1.06 | 2015.00 | 2016.00 | 2016.00 | 2017.00 | 2018.00 |
| population | 19373 | 0 | 19954.11 | 13462.14 | 636.00 | 9155.00 | 18355.00 | 27842.00 | 71374.00 |
| density | 19373 | 0 | 1656.37 | 3651.62 | 0.82 | 169.32 | 627.90 | 1530.77 | 29851.69 |
| age_median | 19373 | 0 | 40.67 | 4.01 | 20.60 | 37.73 | 40.73 | 43.14 | 57.42 |
| age_under_10 | 19373 | 0 | 11.12 | 1.54 | 0.00 | 10.15 | 11.04 | 12.19 | 17.68 |
| age_10_to_19 | 19373 | 0 | 12.87 | 1.87 | 6.31 | 11.79 | 12.92 | 13.91 | 35.30 |
| age_20s | 19373 | 0 | 13.14 | 3.25 | 5.92 | 11.00 | 12.45 | 14.48 | 62.10 |
| age_30s | 19373 | 0 | 12.78 | 2.38 | 1.50 | 11.21 | 12.38 | 13.81 | 25.47 |
| age_40s | 19373 | 0 | 12.09 | 1.24 | 0.80 | 11.36 | 12.14 | 12.91 | 17.82 |
| age_50s | 19373 | 0 | 13.57 | 1.67 | 0.00 | 12.39 | 13.64 | 14.75 | 22.91 |
| age_60s | 19373 | 0 | 12.70 | 2.55 | 0.20 | 10.69 | 12.62 | 14.09 | 24.51 |
| age_70s | 19373 | 0 | 7.70 | 2.15 | 0.00 | 6.09 | 7.38 | 8.91 | 16.27 |
| age_over_80 | 19373 | 0 | 4.02 | 1.24 | 0.00 | 3.29 | 3.82 | 4.56 | 18.82 |
| male | 19373 | 0 | 50.05 | 1.65 | 39.73 | 49.07 | 49.93 | 50.91 | 61.60 |
| female | 19373 | 0 | 49.95 | 1.65 | 38.40 | 49.09 | 50.07 | 50.93 | 60.27 |
| married | 19373 | 0 | 47.99 | 7.44 | 0.90 | 43.27 | 49.87 | 53.21 | 66.90 |
| divorced | 19373 | 0 | 12.65 | 2.09 | 0.20 | 11.07 | 12.65 | 14.19 | 21.03 |
| never_married | 19373 | 0 | 33.47 | 7.87 | 13.44 | 27.41 | 31.88 | 38.41 | 98.90 |
| widowed | 19373 | 0 | 5.89 | 1.55 | 0.00 | 4.77 | 5.60 | 6.67 | 20.65 |
| family_size | 19373 | 3 | 3.19 | 0.22 | 2.55 | 3.04 | 3.16 | 3.31 | 4.17 |
| family_dual_income | 19373 | 3 | 51.81 | 7.10 | 19.31 | 47.66 | 52.66 | 57.02 | 70.92 |
| income_household_median | 19373 | 3 | 75071.32 | 22100.49 | 18465.41 | 60923.48 | 70760.74 | 86319.88 | 164119.18 |
| income_household_under_5 | 19373 | 3 | 3.29 | 1.66 | 0.71 | 2.18 | 2.83 | 4.02 | 19.62 |
| income_household_5_to_10 | 19373 | 3 | 2.54 | 1.50 | 0.36 | 1.51 | 2.16 | 3.16 | 12.72 |
| income_household_10_to_15 | 19373 | 3 | 4.14 | 1.87 | 0.65 | 2.81 | 3.71 | 5.09 | 14.42 |
| income_household_15_to_20 | 19373 | 3 | 3.95 | 1.54 | 1.03 | 2.78 | 3.75 | 4.71 | 12.40 |
| income_household_20_to_25 | 19373 | 3 | 4.08 | 1.41 | 0.84 | 3.04 | 4.00 | 4.88 | 14.35 |
| income_household_25_to_35 | 19373 | 3 | 8.38 | 2.30 | 1.86 | 6.76 | 8.42 | 9.97 | 26.55 |
| income_household_35_to_50 | 19373 | 3 | 11.47 | 2.66 | 1.70 | 9.69 | 11.71 | 13.29 | 24.08 |
| income_household_50_to_75 | 19373 | 3 | 16.72 | 2.81 | 4.95 | 15.19 | 16.98 | 18.45 | 27.13 |
| income_household_75_to_100 | 19373 | 3 | 12.63 | 1.97 | 3.10 | 11.36 | 12.67 | 13.91 | 24.80 |
| income_household_100_to_150 | 19373 | 3 | 15.78 | 3.33 | 1.67 | 13.60 | 15.99 | 18.42 | 31.32 |
| income_household_150_over | 19373 | 3 | 17.02 | 9.45 | 0.84 | 9.97 | 14.74 | 21.73 | 52.82 |
| income_household_six_figure | 19373 | 3 | 32.80 | 11.72 | 3.44 | 24.32 | 30.66 | 40.91 | 69.03 |
| income_individual_median | 19373 | 0 | 36956.54 | 9149.77 | 4316.00 | 31232.79 | 35526.82 | 41287.27 | 88910.50 |
| home_ownership | 19373 | 3 | 66.61 | 14.20 | 15.85 | 56.90 | 71.33 | 77.42 | 90.37 |
| housing_units | 19373 | 0 | 7338.64 | 4850.21 | 0.00 | 3378.00 | 6653.95 | 10666.38 | 25922.55 |
| home_value | 19373 | 3 | 324338.37 | 238712.74 | 60629.00 | 163904.33 | 242866.07 | 378014.35 | 1853109.20 |
| rent_median | 19373 | 3 | 1225.03 | 423.92 | 448.40 | 891.74 | 1164.49 | 1467.65 | 2965.25 |
| rent_burden | 19373 | 3 | 31.28 | 4.81 | 16.69 | 28.34 | 30.83 | 33.63 | 78.94 |
| education_less_highschool | 19373 | 0 | 11.85 | 5.20 | 0.00 | 7.89 | 10.71 | 15.02 | 34.33 |
| education_highschool | 19373 | 0 | 27.99 | 7.96 | 0.00 | 22.50 | 27.60 | 33.60 | 53.84 |
| education_some_college | 19373 | 0 | 28.49 | 5.10 | 7.20 | 25.22 | 29.06 | 32.19 | 50.13 |
| education_bachelors | 19373 | 0 | 19.30 | 6.44 | 2.47 | 13.92 | 18.83 | 24.14 | 41.70 |
| education_graduate | 19373 | 0 | 12.37 | 6.21 | 2.09 | 7.49 | 10.80 | 16.11 | 51.84 |
| education_college_or_above | 19373 | 0 | 31.67 | 12.22 | 7.05 | 21.63 | 29.99 | 40.66 | 77.82 |
| education_stem_degree | 19373 | 0 | 43.25 | 4.58 | 23.91 | 40.08 | 43.02 | 45.81 | 73.00 |
| labor_force_participation | 19373 | 0 | 61.65 | 6.13 | 30.70 | 57.93 | 62.82 | 65.91 | 78.67 |
| unemployment_rate | 19373 | 0 | 5.97 | 2.03 | 0.99 | 4.76 | 5.47 | 6.74 | 18.80 |
| self_employed | 19373 | 1501 | 13.01 | 3.29 | 2.26 | 10.73 | 12.55 | 14.95 | 26.62 |
| farmer | 19373 | 1501 | 2.04 | 3.27 | 0.00 | 0.05 | 0.62 | 2.69 | 27.54 |
| race_white | 19373 | 0 | 70.26 | 17.77 | 14.50 | 57.69 | 70.99 | 85.58 | 98.44 |
| race_black | 19373 | 0 | 11.71 | 12.69 | 0.04 | 2.41 | 6.53 | 17.12 | 69.66 |
| race_asian | 19373 | 0 | 5.19 | 6.44 | 0.00 | 1.05 | 2.70 | 6.63 | 49.85 |
| race_native | 19373 | 0 | 0.86 | 2.51 | 0.00 | 0.20 | 0.34 | 0.72 | 76.93 |
| race_pacific | 19373 | 0 | 0.12 | 0.46 | 0.00 | 0.02 | 0.05 | 0.12 | 14.76 |
| race_other | 19373 | 0 | 5.38 | 5.97 | 0.00 | 1.31 | 3.36 | 7.63 | 33.19 |
| race_multiple | 19373 | 0 | 6.49 | 3.55 | 0.43 | 3.89 | 5.58 | 8.69 | 26.43 |
| hispanic | 19373 | 0 | 17.95 | 17.59 | 0.06 | 4.64 | 11.70 | 24.63 | 99.10 |
| disabled | 19373 | 0 | 13.44 | 3.82 | 4.60 | 10.27 | 13.07 | 15.61 | 35.16 |
| poverty | 19373 | 3 | 13.34 | 5.75 | 3.43 | 9.34 | 12.12 | 16.43 | 48.93 |
| limited_english | 19373 | 3 | 4.63 | 6.62 | 0.00 | 0.98 | 2.67 | 5.96 | 74.07 |
| commute_time | 19373 | 0 | 28.06 | 4.88 | 12.46 | 24.97 | 27.81 | 30.92 | 48.02 |
| health_uninsured | 19373 | 0 | 8.57 | 4.30 | 1.17 | 5.50 | 7.44 | 10.69 | 27.57 |
| veteran | 19373 | 0 | 7.08 | 3.11 | 1.20 | 4.97 | 6.88 | 8.65 | 25.20 |
| treatment_pd | 19373 | 0 | 134.52 | 188.25 | 0.00 | 36.00 | 70.00 | 133.00 | 1446.00 |
# Rows = 19373
# Count the missing values in every column.
# across() with a lambda replaces summarise_all(funs(...)): funs() has been
# deprecated since dplyr 0.8.0, and select(everything()) was a no-op.
training %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
##
## # Simple named list: list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
##
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 1 × 83
## patient_id patient_race payer_type patient_state patient_zip3 patient_age
## <int> <int> <int> <int> <int> <int>
## 1 0 10614 1982 52 0 0
## # ℹ 77 more variables: patient_gender <int>, bmi <int>,
## # breast_cancer_diagnosis_code <int>, breast_cancer_diagnosis_desc <int>,
## # breast_cancer_diagnosis_year <int>, metastatic_cancer_diagnosis_code <int>,
## # metastatic_first_treatment <int>, metastatic_first_treatment_type <int>,
## # metastatic_first_novel_treatment <int>,
## # metastatic_first_novel_treatment_type <int>, region <int>, division <int>,
## # population <int>, density <int>, age_median <int>, age_under_10 <int>, …
# Column-wise minimum and maximum of the numeric variables.
# Missing values are dropped per column (na.rm = TRUE) instead of calling
# na.omit() on the whole table: na.omit() discards every row that has an NA in
# *any* of the 83 columns, so the ranges would be computed on a small subset of
# the data rather than on all observed values of each variable.
training %>%
  summarise(across(where(is.numeric), ~ min(.x, na.rm = TRUE)))
training %>%
  summarise(across(where(is.numeric), ~ max(.x, na.rm = TRUE)))
# Patient ages in ascending order.
training %>%
  select(patient_age) %>%
  arrange(patient_age)

# Number of patients per recorded gender value.
training %>%
  count(patient_gender)

# Number of patients per state, most frequent first.
training %>%
  count(patient_state, sort = TRUE)
# Missing-value counts for the columns inspected below.
# This replaces the earlier `summary(NA)` calls: the NA was passed as
# summary()'s `maxsum` argument, and summary() on a character column only
# prints Length/Class/Mode, so it never showed how many values were missing.
# NOTE(review): the original note on metastatic_first_novel_treatment read
# "13243 NA's", which is the NA count skim() reports for bmi -- re-check it
# against the output of this call.
training %>%
  summarise(across(
    c(
      metastatic_first_novel_treatment,
      metastatic_first_novel_treatment_type,
      bmi,
      patient_race,
      metastatic_first_treatment_type
    ),
    ~ sum(is.na(.x))
  ))

# Value frequencies for each of those columns.
training %>%
  count(metastatic_first_novel_treatment)
training %>%
  count(metastatic_first_novel_treatment_type)

# bmi is numeric, so summary() gives its distribution (including NA's).
training %>%
  pull(bmi) %>%
  summary()
training %>%
  count(bmi)
training %>%
  count(patient_race)
training %>%
  count(metastatic_first_treatment_type)

# Diagnosis descriptions (most frequent first) and metastatic diagnosis codes.
training %>%
  count(breast_cancer_diagnosis_desc) %>%
  arrange(desc(n))
training %>%
  count(metastatic_cancer_diagnosis_code)
# Drop the variables judged to have too many missing values.
training <- training %>%
  select(-c(
    metastatic_first_novel_treatment,
    metastatic_first_novel_treatment_type,
    patient_race,
    payer_type,
    bmi,
    metastatic_first_treatment_type,
    self_employed,
    farmer
  ))
# Breast cancer diagnosis codes, most frequent first.
training %>%
  count(breast_cancer_diagnosis_code, sort = TRUE)

# Metastatic cancer diagnosis codes, most frequent first.
training %>%
  count(metastatic_cancer_diagnosis_code, sort = TRUE)

# First-line treatment drugs (in the default order returned by count()).
training %>%
  count(metastatic_first_treatment)
# Explore which laterality keywords appear in the diagnosis descriptions.
# str_extract() returns the keyword where it occurs and NA elsewhere.
# The former `any <- str_extract(finding_side, "left")` was removed: it was an
# unused duplicate of `left_side` and shadowed base::any().
finding_side <- training %>%
  pull(breast_cancer_diagnosis_desc)

left_side <- str_extract(finding_side, "left")
enframe(left_side)

right_side <- str_extract(finding_side, "right")
enframe(right_side)

unspecified_side <- str_extract(finding_side, "unspecified")
enframe(unspecified_side)
# Keep the description vector available under its existing name.
finding_side <- training %>%
  pull(breast_cancer_diagnosis_desc)

# Derive the affected side from the diagnosis description: "left" takes
# precedence over "right", and anything else (including missing descriptions)
# is labelled "unspecified".
training <- training %>%
  mutate(breast_cancer_diagnosis_side = case_when(
    str_detect(breast_cancer_diagnosis_desc, "left") ~ "left",
    str_detect(breast_cancer_diagnosis_desc, "right") ~ "right",
    .default = "unspecified"
  ))

# Frequency of each derived side.
training %>%
  count(breast_cancer_diagnosis_side)
## # A tibble: 3 × 2
## breast_cancer_diagnosis_side n
## <chr> <int>
## 1 left 7394
## 2 right 7167
## 3 unspecified 4812
# left = 7394
# right = 7167
# unspecified = 4812
# Frequency of each first-line treatment name before harmonising variants.
training %>%
count(metastatic_first_treatment)
## # A tibble: 37 × 2
## metastatic_first_treatment n
## <chr> <int>
## 1 BEVACIZUMAB 154
## 2 BLEOMYCIN SULFATE 1
## 3 CAPECITABINE 1121
## 4 CARBOPLATIN 3158
## 5 CISPLATIN 78
## 6 CYCLOPHOSPHAMIDE 2920
## 7 DOCETAXEL 817
## 8 DOCETAXEL ANHYDROUS 83
## 9 DOXORUBICIN HCL 6895
## 10 DOXORUBICIN HCL LIPOSOMAL 123
## # ℹ 27 more rows
# Harmonise treatment names: each variant spelling on the left is replaced by
# the name on the right; names not listed here are kept unchanged.
treatment_synonyms <- c(
  "DOCETAXEL ANHYDROUS" = "DOCETAXEL",
  "DOXORUBICIN HCL LIPOSOMAL" = "DOXORUBICIN HYDROCHLORIDE",
  "DOXORUBICIN HCL" = "DOXORUBICIN HYDROCHLORIDE",
  "EPIRUBICIN HCL" = "EPIRUBICIN HYDROCHLORIDE",
  "GEMCITABINE HCL" = "GEMCITABINE HYDROCHLORIDE",
  "METHOTREXATE" = "METHOTREXATE SODIUM",
  "PACLITAXEL PROTEIN BOUND PARTICLES" = "PACLITAXEL",
  "PEMETREXED DISODIUM HEPTAHYDRATE" = "PEMETREXED DISODIUM"
)

training <- training %>%
  mutate(metastatic_first_treatment = coalesce(
    # Lookup yields NA for names without a synonym; coalesce() then falls
    # back to the original value.
    unname(treatment_synonyms[metastatic_first_treatment]),
    metastatic_first_treatment
  ))
# Metastatic diagnosis codes grouped by the body system they are assigned to.
body_system_codes <- list(
  "lymphatic" = c("C770", "C771", "C772", "C773", "C774", "C775", "C778", "C779"),
  "respiratory" = c("C7800", "C7801", "C7802", "C781", "C782", "C7839"),
  "digestive" = c("C784", "C785", "C786", "C787", "C7889"),
  "urinary" = c("C7900", "C7901", "C7902", "C7910"),
  "nervous" = c("C792", "C7931", "C7932", "C7940", "C7949"),
  "skeletal" = c("C7951", "C7952"),
  "reproductive" = c("C7960", "C7961", "C7962", "C7981", "C7982"),
  "adrenal glands" = c("C7970", "C7971", "C7972"),
  "other" = c("C7989", "C799")
)

# Flatten to a code -> body system lookup vector.
code_to_system <- setNames(
  rep(names(body_system_codes), lengths(body_system_codes)),
  unlist(body_system_codes, use.names = FALSE)
)

# Codes that are not listed above are left as NA.
training <- training %>%
  mutate(
    metastatic_cancer_body_system =
      unname(code_to_system[metastatic_cancer_diagnosis_code])
  )

# Frequency of each body system.
training %>%
  count(metastatic_cancer_body_system)
## # A tibble: 9 × 2
## metastatic_cancer_body_system n
## <chr> <int>
## 1 adrenal glands 10
## 2 digestive 753
## 3 lymphatic 14347
## 4 nervous 390
## 5 other 777
## 6 reproductive 400
## 7 respiratory 751
## 8 skeletal 1938
## 9 urinary 7
library(mice)

# Outcome plus the numeric variables (family, household income, housing) passed
# to the imputation model.
imputeData <- training %>%
  select(
    treatment_pd, family_size, family_dual_income, income_household_median,
    income_household_under_5, income_household_5_to_10,
    income_household_10_to_15, income_household_15_to_20,
    income_household_20_to_25, income_household_25_to_35,
    income_household_35_to_50, income_household_50_to_75,
    income_household_75_to_100, income_household_100_to_150,
    income_household_150_over, income_household_six_figure,
    home_ownership, home_value, rent_median, rent_burden
  )

# Tabulate the missing-data patterns.
md.pattern(imputeData)

library(VIM)

# Plot the share of missing values per variable and the combinations in which
# they occur.
aggr_plot <- aggr(
  imputeData,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(imputeData),
  cex.axis = .7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)

# Five imputations with predictive mean matching. The argument is spelled out
# as `method` instead of relying on partial matching of `meth`.
tempImputeData <- mice(imputeData, m = 5, method = "pmm", seed = 500)
summary(tempImputeData)

# Extract a completed data set. The call is namespaced because tidyr also
# exports a function named complete().
completeImputed <- mice::complete(tempImputeData)

# Confirm that no missing values remain; vapply() guarantees an integer vector.
vapply(completeImputed, function(x) sum(is.na(x)), integer(1))

# NOTE(review): completeImputed is not written back into `training` anywhere in
# the visible script, so later plots still drop the rows with missing values.
# treatment pd -- very right skewed
ggplot(training, aes(x = treatment_pd)) +
  geom_boxplot()

# Same distribution restricted to periods under 500 days.
training %>%
  filter(treatment_pd < 500) %>%
  ggplot(aes(x = treatment_pd)) +
  geom_boxplot() +
  labs(
    x = "Treatment Period (days)",
    title = "Distribution of Period Between Diagnosis and Treatment"
  )

# Same distribution restricted to periods under 200 days.
training %>%
  filter(treatment_pd < 200) %>%
  ggplot(aes(x = treatment_pd)) +
  geom_boxplot()
# region
ggplot(training, aes(x = region)) +
  geom_bar()

# division
ggplot(training, aes(y = division)) +
  geom_bar()

# patient age -- pretty normal
ggplot(training, aes(x = patient_age)) +
  geom_boxplot() +
  labs(
    x = "Age (Years)",
    title = "Distribution of Patient Age"
  )

# density -- very right skewed (values above 10000 excluded from the plot)
training %>%
  filter(density <= 10000) %>%
  ggplot(aes(x = density)) +
  geom_boxplot()

# female -- normal
ggplot(training, aes(x = female)) +
  geom_boxplot()

# health uninsured -- little right skewed
ggplot(training, aes(x = health_uninsured)) +
  geom_boxplot()

# race white -- little left skewed
ggplot(training, aes(x = race_white)) +
  geom_boxplot()

# limited english -- right skewed (values of 40 and above excluded)
training %>%
  filter(limited_english < 40) %>%
  ggplot(aes(x = limited_english)) +
  geom_boxplot()

# metastatic cancer body system
ggplot(training, aes(x = metastatic_cancer_body_system)) +
  geom_bar() +
  labs(
    x = "Body System",
    title = "Distribution of Body System with Metastatic Cancer"
  )
# metastatic first treatment: drugs given to more than 1000 patients
training %>%
  count(metastatic_first_treatment) %>%
  filter(n > 1000)
## # A tibble: 5 × 2
## metastatic_first_treatment n
## <chr> <int>
## 1 CAPECITABINE 1121
## 2 CARBOPLATIN 3158
## 3 CYCLOPHOSPHAMIDE 2920
## 4 DOXORUBICIN HYDROCHLORIDE 7200
## 5 PACLITAXEL 2494

# Bar chart restricted to the five drugs listed in the table above.
training %>%
  filter(
    metastatic_first_treatment %in% c(
      "CAPECITABINE",
      "CARBOPLATIN",
      "CYCLOPHOSPHAMIDE",
      "DOXORUBICIN HYDROCHLORIDE",
      "PACLITAXEL"
    )
  ) %>%
  select(metastatic_first_treatment) %>%
  ggplot(aes(x = metastatic_first_treatment)) +
  geom_bar() +
  labs(
    x = "Treatment Drug",
    y = "Number of Patients",
    title = "Top 5 Metastatic Cancer Treatment Drugs"
  )
# income household median
ggplot(training, aes(x = income_household_median)) +
  geom_boxplot()
## Warning: Removed 3 rows containing non-finite values (`stat_boxplot()`).

# home ownership -- left skewed
ggplot(training, aes(x = home_ownership)) +
  geom_boxplot()
## Warning: Removed 3 rows containing non-finite values (`stat_boxplot()`).

# rent burden -- somewhat normal, a little right skewed
# (values of 70 and above excluded from the plot)
training %>%
  filter(rent_burden < 70) %>%
  ggplot(aes(x = rent_burden)) +
  geom_boxplot()

# education high school -- normal
ggplot(training, aes(x = education_highschool)) +
  geom_boxplot()

# unemployment rate -- right skewed
ggplot(training, aes(x = unemployment_rate)) +
  geom_boxplot()

# poverty -- right skewed
ggplot(training, aes(x = poverty)) +
  geom_boxplot()
## Warning: Removed 3 rows containing non-finite values (`stat_boxplot()`).
# Treatment period against selected predictors, colored by region

# patient age
training %>%
  ggplot(aes(x = patient_age, y = treatment_pd, colour = region)) +
  geom_point()

# median household income for zip code
# more south in lower income median
training %>%
  ggplot(aes(x = income_household_median, y = treatment_pd, colour = region)) +
  geom_point()
## Warning: Removed 3 rows containing missing values (`geom_point()`).

# population for zip code
# outliers are in northeast, more midwest in lower population
training %>%
  ggplot(aes(x = population, y = treatment_pd, colour = region)) +
  geom_point()

# median age for zip code
training %>%
  ggplot(aes(x = age_median, y = treatment_pd, colour = region)) +
  geom_point()

# unemployment rate for zip code
training %>%
  ggplot(aes(x = unemployment_rate, y = treatment_pd, colour = region)) +
  geom_point()

# patient age
# NOTE(review): identical to the first plot of this section (previously
# labelled "patient age for zip code") -- confirm which variable was intended.
training %>%
  ggplot(aes(x = patient_age, y = treatment_pd, colour = region)) +
  geom_point()
# Treatment period against selected predictors, colored by diagnosis side
# all of them -- unspecified more with higher treatment period

# patient age
training %>%
  ggplot(aes(
    x = patient_age, y = treatment_pd,
    colour = breast_cancer_diagnosis_side
  )) +
  geom_point()

# median household income for zip code
training %>%
  ggplot(aes(
    x = income_household_median, y = treatment_pd,
    colour = breast_cancer_diagnosis_side
  )) +
  geom_point()
## Warning: Removed 3 rows containing missing values (`geom_point()`).

# population for zip code
training %>%
  ggplot(aes(
    x = population, y = treatment_pd,
    colour = breast_cancer_diagnosis_side
  )) +
  geom_point()

# median age for zip code
training %>%
  ggplot(aes(
    x = age_median, y = treatment_pd,
    colour = breast_cancer_diagnosis_side
  )) +
  geom_point()

# unemployment rate for zip code
training %>%
  ggplot(aes(
    x = unemployment_rate, y = treatment_pd,
    colour = breast_cancer_diagnosis_side
  )) +
  geom_point()

# patient age
# NOTE(review): identical to the first plot of this section (previously
# labelled "patient age for zip code") -- confirm which variable was intended.
training %>%
  ggplot(aes(
    x = patient_age, y = treatment_pd,
    colour = breast_cancer_diagnosis_side
  )) +
  geom_point()
# Treatment period against selected predictors, colored by body system
# all -- lymphatic areas are more around shorter treatment periods

# patient age
training %>%
  ggplot(aes(
    x = patient_age, y = treatment_pd,
    colour = metastatic_cancer_body_system
  )) +
  geom_point()

# median household income for zip code
training %>%
  ggplot(aes(
    x = income_household_median, y = treatment_pd,
    colour = metastatic_cancer_body_system
  )) +
  geom_point()
## Warning: Removed 3 rows containing missing values (`geom_point()`).

# population for zip code
training %>%
  ggplot(aes(
    x = population, y = treatment_pd,
    colour = metastatic_cancer_body_system
  )) +
  geom_point()

# median age for zip code
training %>%
  ggplot(aes(
    x = age_median, y = treatment_pd,
    colour = metastatic_cancer_body_system
  )) +
  geom_point()

# unemployment rate for zip code
training %>%
  ggplot(aes(
    x = unemployment_rate, y = treatment_pd,
    colour = metastatic_cancer_body_system
  )) +
  geom_point()

# patient age
# NOTE(review): identical to the first plot of this section (previously
# labelled "patient age for zip code") -- confirm which variable was intended.
training %>%
  ggplot(aes(
    x = patient_age, y = treatment_pd,
    colour = metastatic_cancer_body_system
  )) +
  geom_point()
# Treatment period by body system, for periods under 1000 days.
training %>%
  filter(treatment_pd < 1000) %>%
  ggplot(aes(x = treatment_pd, y = metastatic_cancer_body_system)) +
  geom_boxplot()

# Treatment period by drug, restricted to the five most common drugs.
training %>%
  filter(
    metastatic_first_treatment %in% c(
      "CAPECITABINE",
      "CARBOPLATIN",
      "CYCLOPHOSPHAMIDE",
      "DOXORUBICIN HYDROCHLORIDE",
      "PACLITAXEL"
    )
  ) %>%
  select(metastatic_first_treatment, treatment_pd) %>%
  ggplot(aes(x = treatment_pd, y = metastatic_first_treatment)) +
  geom_boxplot()
# Pairwise relationships between the outcome and the numeric predictors
# explored in the scatter plots above.
# select() previously had no arguments, which yields a zero-column tibble and
# leaves ggpairs() with nothing to plot.
# NOTE(review): the variable list below is a suggestion -- adjust as needed.
training %>%
  select(
    treatment_pd,
    patient_age,
    income_household_median,
    population,
    age_median,
    unemployment_rate
  ) %>%
  ggpairs()